Load libraries

list.of.packages <- c("tidyverse", "reshape2", "here", "methylKit", "ggforce", "matrixStats", "Pstat", "viridis", "plotly", "readr", "GenomicRanges", "vegan", "factoextra") #add new libraries here 

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

Obtain session information

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.15.7
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] factoextra_1.0.7     vegan_2.5-6          lattice_0.20-41     
##  [4] permute_0.9-5        plotly_4.9.2.1       viridis_0.5.1       
##  [7] viridisLite_0.3.0    Pstat_1.2            matrixStats_0.56.0  
## [10] ggforce_0.3.1        methylKit_1.8.1      GenomicRanges_1.34.0
## [13] GenomeInfoDb_1.18.2  IRanges_2.16.0       S4Vectors_0.20.1    
## [16] BiocGenerics_0.28.0  here_0.1             reshape2_1.4.4      
## [19] forcats_0.5.0        stringr_1.4.0        dplyr_0.8.5         
## [22] purrr_0.3.4          readr_1.3.1          tidyr_1.0.3         
## [25] tibble_3.0.1         ggplot2_3.3.0        tidyverse_1.3.0     
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1            ellipsis_0.3.0             
##  [3] mclust_5.4.5                rprojroot_1.3-2            
##  [5] qvalue_2.14.1               XVector_0.22.0             
##  [7] fs_1.4.1                    rstudioapi_0.11            
##  [9] farver_2.0.3                ggrepel_0.8.2              
## [11] fansi_0.4.1                 mvtnorm_1.1-0              
## [13] lubridate_1.7.8             xml2_1.3.2                 
## [15] splines_3.5.1               R.methodsS3_1.8.0          
## [17] knitr_1.28                  polyclip_1.10-0            
## [19] jsonlite_1.6.1              Rsamtools_1.34.1           
## [21] broom_0.5.6                 cluster_2.1.0              
## [23] dbplyr_1.4.3                R.oo_1.23.0                
## [25] compiler_3.5.1              httr_1.4.1                 
## [27] backports_1.1.6             lazyeval_0.2.2             
## [29] assertthat_0.2.1            Matrix_1.2-18              
## [31] limma_3.38.3                cli_2.0.2                  
## [33] tweenr_1.0.1                htmltools_0.4.0            
## [35] tools_3.5.1                 coda_0.19-3                
## [37] gtable_0.3.0                glue_1.4.0                 
## [39] GenomeInfoDbData_1.2.0      Rcpp_1.0.4.6               
## [41] bbmle_1.0.23.1              Biobase_2.42.0             
## [43] cellranger_1.1.0            vctrs_0.3.0                
## [45] Biostrings_2.50.2           nlme_3.1-143               
## [47] rtracklayer_1.42.2          xfun_0.13                  
## [49] fastseg_1.28.0              rvest_0.3.5                
## [51] lifecycle_0.2.0             gtools_3.8.2               
## [53] XML_3.99-0.3                zlibbioc_1.28.0            
## [55] MASS_7.3-51.6               scales_1.1.1               
## [57] hms_0.5.3                   SummarizedExperiment_1.12.0
## [59] yaml_2.2.1                  gridExtra_2.3              
## [61] emdbook_1.3.12              bdsmatrix_1.3-4            
## [63] stringi_1.4.6               BiocParallel_1.16.6        
## [65] rlang_0.4.6                 pkgconfig_2.0.3            
## [67] bitops_1.0-6                evaluate_0.14              
## [69] htmlwidgets_1.5.1           GenomicAlignments_1.18.1   
## [71] tidyselect_1.1.0            plyr_1.8.6                 
## [73] magrittr_1.5                R6_2.4.1                   
## [75] generics_0.0.2              DelayedArray_0.8.0         
## [77] DBI_1.1.0                   mgcv_1.8-31                
## [79] pillar_1.4.4                haven_2.2.0                
## [81] withr_2.2.0                 RCurl_1.98-1.2             
## [83] modelr_0.1.7                crayon_1.3.4               
## [85] rmarkdown_2.1               grid_3.5.1                 
## [87] readxl_1.3.1                data.table_1.12.8          
## [89] reprex_0.3.0                digest_0.6.25              
## [91] numDeriv_2016.8-1.1         R.utils_2.9.2              
## [93] munsell_0.5.0

Check current directory

getwd()
## [1] "/Volumes/Bumblebee/C.magister_methyl-oa/notebooks"

Download files from gannet

Create list of all bismark data files, which have reads that are already trimmed and aligned. These BAM files are also sorted and indexed.

IMPORTANT NOTE: The location of these files depends on where the user saved them. I (Laura) used an external hard drive.

Files were transferred from Hyak on December 27th, 2020 using rsync.

file.list=list("/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-06_S1_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-14_S2_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-22_S3_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-38_S4_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-04_S5_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-15_S6_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-33_S7_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-01_S8_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-06_S9_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-21_S10_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-24_S11_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-06_S12_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-11_S13_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-24_S14_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-02_S15_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-13_S16_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-28_S17_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-01_S18_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-08_S19_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-11_S20_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam")

Read in MiSeq bismark summary with sample treatment info

summary <- read_delim(file="../qc-processing/MiSeq-QC/mbdall.txt", delim="\t") %>%
  arrange(sample_mbd) %>% filter(sample_mbd != "NA")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   sample = col_character(),
##   treatment = col_character(),
##   treatment_high_lowCO2 = col_character(),
##   developmental_stage = col_character(),
##   notes = col_character(),
##   DNAiso_batch = col_character(),
##   `pool for library?` = col_character()
## )
## See spec(...) for full column specifications.
head(summary)
## # A tibble: 6 x 29
##   sample sample_mbd  tank treatment treatment_high_… developmental_s… notes
##   <chr>       <dbl> <dbl> <chr>     <chr>            <chr>            <chr>
## 1 CH01-…          1     3 LC        L                J7               <NA> 
## 2 CH01-…          2     3 LC        L                J6               <NA> 
## 3 CH01-…          3     1 LB        L                J6               <NA> 
## 4 CH01-…          4     1 LB        L                J6               <NA> 
## 5 CH03-…          5     1 LB        L                J7               <NA> 
## 6 CH03-…          6     1 LB        L                J6               <NA> 
## # … with 22 more variables: DNAiso_batch <chr>, conc_ng.uL <dbl>,
## #   yield_ug <dbl>, MBD_concentration_ng.uL <dbl>, Total_recovery_ng <dbl>,
## #   Percent_recovery <dbl>, MBD_date <dbl>,
## #   library_bioanlyzer_mean_fragment_size_bp <dbl>,
## #   library_bioanalyzer_molarity_pmol.L <dbl>, `pool for library?` <chr>,
## #   qubit_concentration_ng.uL <dbl>, qubit_molarity_nM <dbl>,
## #   library_4nM_uL <dbl>, Total_Reads <dbl>, Aligned_Reads <dbl>,
## #   Unaligned_Reads <dbl>, Ambiguously_Aligned_Reads <dbl>, Unique_reads <dbl>,
## #   perc_totalread_unique <dbl>, CpGs_Meth <dbl>, CHGs_Meth <dbl>,
## #   CHHs_Meth <dbl>
as.numeric(as.factor(summary$treatment_high_lowCO2))
##  [1] 2 2 2 2 2 2 2 1 1 1 1 2 2 2 1 1 1 1 1 1

The following command reads sorted BAM files and calls methylation percentage per base, and creates a methylRaw object for CpG methylation.

It also assigns minimum coverage of 2x and the treatments (in this case, the Olympia oyster population)

myobj_MiSeq = processBismarkAln(location = file.list, sample.id = list("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18", "19", "20"), assembly = "pilon_scaffolds.fasta", read.context="CpG", mincov=2, treatment = as.numeric(as.factor(summary$treatment_high_lowCO2)))
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 2378671
## average conversion rate = 96.065256012691
## total otherC considered (Forward) (>95% C+T): 1193250
## average conversion rate (Forward) = 96.137759544132
## total otherC considered (Reverse) (>95% C+T): 1185421
## average conversion rate (Reverse) = 95.992273638587
## 
## Reading methylation percentage per base for sample: 1 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 2217796
## average conversion rate = 94.672982595648
## total otherC considered (Forward) (>95% C+T): 1112302
## average conversion rate (Forward) = 94.624143840445
## total otherC considered (Reverse) (>95% C+T): 1105494
## average conversion rate (Reverse) = 94.722122116161
## 
## Reading methylation percentage per base for sample: 2 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 2263720
## average conversion rate = 95.581407672606
## total otherC considered (Forward) (>95% C+T): 1131606
## average conversion rate (Forward) = 95.543853429588
## total otherC considered (Reverse) (>95% C+T): 1132114
## average conversion rate (Reverse) = 95.618945064358
## 
## Reading methylation percentage per base for sample: 3 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 3645550
## average conversion rate = 94.13230957402
## total otherC considered (Forward) (>95% C+T): 1823926
## average conversion rate (Forward) = 94.177995997738
## total otherC considered (Reverse) (>95% C+T): 1821624
## average conversion rate (Reverse) = 94.086565416023
## 
## Reading methylation percentage per base for sample: 4 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 2436450
## average conversion rate = 87.274885945743
## total otherC considered (Forward) (>95% C+T): 1220235
## average conversion rate (Forward) = 87.229065553861
## total otherC considered (Reverse) (>95% C+T): 1216215
## average conversion rate (Reverse) = 87.320857789446
## 
## Reading methylation percentage per base for sample: 5 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 932616
## average conversion rate = 91.078253451476
## total otherC considered (Forward) (>95% C+T): 470176
## average conversion rate (Forward) = 90.915530223976
## total otherC considered (Reverse) (>95% C+T): 462440
## average conversion rate (Reverse) = 91.243698819983
## 
## Reading methylation percentage per base for sample: 6 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 2933688
## average conversion rate = 96.012887683317
## total otherC considered (Forward) (>95% C+T): 1469518
## average conversion rate (Forward) = 95.998314448791
## total otherC considered (Reverse) (>95% C+T): 1464170
## average conversion rate (Reverse) = 96.027514147767
## 
## Reading methylation percentage per base for sample: 7 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 2894259
## average conversion rate = 91.696373964204
## total otherC considered (Forward) (>95% C+T): 1451879
## average conversion rate (Forward) = 91.686472602622
## total otherC considered (Reverse) (>95% C+T): 1442380
## average conversion rate (Reverse) = 91.706340532621
## 
## Reading methylation percentage per base for sample: 8 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 682586
## average conversion rate = 91.904421357223
## total otherC considered (Forward) (>95% C+T): 344535
## average conversion rate (Forward) = 91.717234146634
## total otherC considered (Reverse) (>95% C+T): 338051
## average conversion rate (Reverse) = 92.09519891919
## 
## Reading methylation percentage per base for sample: 9 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 3405977
## average conversion rate = 94.817017347492
## total otherC considered (Forward) (>95% C+T): 1706831
## average conversion rate (Forward) = 94.844422590115
## total otherC considered (Reverse) (>95% C+T): 1699146
## average conversion rate (Reverse) = 94.789488154786
## 
## Reading methylation percentage per base for sample: 10 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 2241785
## average conversion rate = 94.636999287808
## total otherC considered (Forward) (>95% C+T): 1123557
## average conversion rate (Forward) = 94.683341447348
## total otherC considered (Reverse) (>95% C+T): 1118228
## average conversion rate (Reverse) = 94.590436281206
## 
## Reading methylation percentage per base for sample: 11 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 3050774
## average conversion rate = 94.773405914057
## total otherC considered (Forward) (>95% C+T): 1528623
## average conversion rate (Forward) = 94.788361353695
## total otherC considered (Reverse) (>95% C+T): 1522151
## average conversion rate (Reverse) = 94.758386885717
## 
## Reading methylation percentage per base for sample: 12 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 1180361
## average conversion rate = 92.2459727698
## total otherC considered (Forward) (>95% C+T): 592922
## average conversion rate (Forward) = 92.389147002188
## total otherC considered (Reverse) (>95% C+T): 587439
## average conversion rate (Reverse) = 92.101462187058
## 
## Reading methylation percentage per base for sample: 13 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 1455976
## average conversion rate = 92.749181512779
## total otherC considered (Forward) (>95% C+T): 730372
## average conversion rate (Forward) = 92.582384657779
## total otherC considered (Reverse) (>95% C+T): 725604
## average conversion rate (Reverse) = 92.917074402814
## 
## Reading methylation percentage per base for sample: 14 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 4367475
## average conversion rate = 93.409381942564
## total otherC considered (Forward) (>95% C+T): 2186605
## average conversion rate (Forward) = 93.49627739973
## total otherC considered (Reverse) (>95% C+T): 2180870
## average conversion rate (Reverse) = 93.322257977762
## 
## Reading methylation percentage per base for sample: 15 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 1486670
## average conversion rate = 96.189155254543
## total otherC considered (Forward) (>95% C+T): 746735
## average conversion rate (Forward) = 96.140865558575
## total otherC considered (Reverse) (>95% C+T): 739935
## average conversion rate (Reverse) = 96.237888732644
## 
## Reading methylation percentage per base for sample: 16 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 3911508
## average conversion rate = 94.158000028965
## total otherC considered (Forward) (>95% C+T): 1956696
## average conversion rate (Forward) = 94.175367140687
## total otherC considered (Reverse) (>95% C+T): 1954812
## average conversion rate (Reverse) = 94.140616179245
## 
## Reading methylation percentage per base for sample: 17 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 4193526
## average conversion rate = 94.676508186089
## total otherC considered (Forward) (>95% C+T): 2097370
## average conversion rate (Forward) = 94.705883728718
## total otherC considered (Reverse) (>95% C+T): 2096156
## average conversion rate (Reverse) = 94.647115630458
## 
## Reading methylation percentage per base for sample: 18 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 2738025
## average conversion rate = 90.855175786794
## total otherC considered (Forward) (>95% C+T): 1372577
## average conversion rate (Forward) = 90.921646931751
## total otherC considered (Reverse) (>95% C+T): 1365448
## average conversion rate (Reverse) = 90.788357596184
## 
## Reading methylation percentage per base for sample: 19 
## 
## Conversion Statistics:
## 
## total otherC considered (>95% C+T): 5796052
## average conversion rate = 94.406651018968
## total otherC considered (Forward) (>95% C+T): 2905545
## average conversion rate (Forward) = 94.349079789785
## total otherC considered (Reverse) (>95% C+T): 2890507
## average conversion rate (Reverse) = 94.464521765207
## 
## Reading methylation percentage per base for sample: 20

Save the MethylKit object; re-doing the previous step is memory/time intensive, so best to use the saved object moving forward.

getwd()
## [1] "/Volumes/Bumblebee/C.magister_methyl-oa/notebooks"
save(myobj_MiSeq, file = "../qc-processing/MiSeq-QC/myobj_MiSeq") 

compress MethylKit object, if desired

#zip ../analyses/myobj_MiSeq.zip ../analyses/myobj_MiSeq

Load object if needed

#load("../qc-processing/MiSeq-QC/myobj_MiSeq")

What does the resulting object look like?

# Check out data for sample #1 
head(myobj_MiSeq[[1]])
##               chr start   end strand coverage numCs numTs
## 1  contig_1_pilon 24401 24401      -        2     0     2
## 2  contig_1_pilon 24409 24409      -        2     0     2
## 3  contig_1_pilon 24419 24419      -        2     0     2
## 4 contig_10_pilon 34002 34002      +        2     0     2
## 5 contig_10_pilon 51920 51920      -        2     0     2
## 6 contig_10_pilon 55069 55069      +        2     0     2

Check the basic stats about the methylation data - coverage and percent methylation. Index the object to look through each sample

# First look at % CpG methylation (panels)
for (i in 1:20) {
getMethylationStats(myobj_MiSeq[[i]],plot=T,both.strands=TRUE)
     mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

# Now look at coverage 
for (i in 1:20) {
 getCoverageStats(myobj_MiSeq[[i]],plot=TRUE,both.strands=TRUE) 
   mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Filter out loci where coverage is less than 5x or greater than 100x.

MiSeq_5x=filterByCoverage(myobj_MiSeq,lo.count=5,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)
save(MiSeq_5x, file="../qc-processing/MiSeq-QC/MiSeq_5x")

Now look at CpG methylation after filtering for 5x coverage

for (i in 1:20) {
getMethylationStats(MiSeq_5x[[i]],plot=T,both.strands=TRUE)
  mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Filter out loci where coverage is less than 10x or greater than 100x.

MiSeq_10x=filterByCoverage(myobj_MiSeq,lo.count=10,lo.perc=NULL,
                                      hi.count=100,hi.perc=NULL)
save(MiSeq_10x, file="../qc-processing/MiSeq-QC/MiSeq_10x")

Now look at CpG methylation after filtering for 10x coverage

for (i in 1:20) {
getMethylationStats(MiSeq_10x[[i]],plot=T,both.strands=TRUE)
  mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}

Column bind all samples, and destrand.

meth_unite=methylKit::unite(myobj_MiSeq, destrand=TRUE, min.per.group=1L)
#save(meth_unite, file = "../qc-processing/MiSeq-QC/meth_unite") #save object to file

Reshape data to do various filtering and calculations

meth_unite_reshaped <- melt(data=meth_unite, id=c("chr", "start", "end", "strand"), value.name = "count") %>%
 tidyr::separate(col = "variable" , into = c("variable", "sample"), sep = "(?<=[a-zA-Z])\\s*(?=[0-9])") %>%
  dcast(chr+start+end+strand+sample ~  variable) %>%
  drop_na(coverage) %>% 
  mutate(percMeth = 100*(numCs/coverage)) %>% 
  mutate(sample=as.numeric(sample))
## Using count as value column: use value.var to override.
head(meth_unite_reshaped)
##                 chr start   end strand sample coverage numCs numTs percMeth
## 1 contig_1003_pilon 25122 25122      +      1       10    10     0      100
## 2 contig_1003_pilon 25143 25143      +      1        7     7     0      100
## 3 contig_1003_pilon 25146 25146      +      1        3     3     0      100
## 4 contig_1003_pilon 25162 25162      +      1        5     5     0      100
## 5 contig_1003_pilon 25166 25166      +      1        5     3     2       60
## 6 contig_1003_pilon 25186 25186      +      1        7     7     0      100

No coverage threshold, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  group_by(sample) %>%
  summarize(mean = mean(percMeth), nloci = n())
## # A tibble: 20 x 3
##    sample  mean  nloci
##     <dbl> <dbl>  <int>
##  1      1  77.9 307273
##  2      2  76.0 280847
##  3      3  72.3 258637
##  4      4  74.8 416299
##  5      5  35.2 132251
##  6      6  48.9  64462
##  7      7  77.0 347761
##  8      8  68.5 280665
##  9      9  17.8  31092
## 10     10  72.6 372596
## 11     11  78.3 299331
## 12     12  72.7 333387
## 13     13  24.1  51783
## 14     14  21.6  60153
## 15     15  74.9 431389
## 16     16  65.5 139884
## 17     17  71.0 337914
## 18     18  74.8 414587
## 19     19  56.7 206273
## 20     20  72.7 491751

5x coverage, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  filter(coverage>=5) %>% 
  group_by(sample) %>%
  summarize(mean = mean(percMeth,), nloci = n())
## # A tibble: 20 x 3
##    sample  mean  nloci
##     <dbl> <dbl>  <int>
##  1      1 83.5  150500
##  2      2 81.4   97256
##  3      3 78.8  113501
##  4      4 79.9  190444
##  5      5  9.69   6078
##  6      6 61.0    5172
##  7      7 82.3  197277
##  8      8 76.4   53183
##  9      9  1.84   1418
## 10     10 78.6  165889
## 11     11 83.7  121088
## 12     12 80.9  158257
## 13     13  2.41   2773
## 14     14  2.18   3054
## 15     15 81.5  224040
## 16     16 69.3   10767
## 17     17 80.6  158567
## 18     18 80.2  240812
## 19     19 59.6   16414
## 20     20 79.4  288343

10x coverage, what is mean % methylated and how many loci are there?

meth_unite_reshaped %>% 
  filter(coverage>=10) %>% 
  group_by(sample) %>%
  summarize(mean = mean(percMeth), nloci = n())
## # A tibble: 20 x 3
##    sample  mean  nloci
##     <dbl> <dbl>  <int>
##  1      1 84.7   75634
##  2      2 82.7   25360
##  3      3 80.0   56056
##  4      4 81.4   66130
##  5      5  3.15   1908
##  6      6 51.5     846
##  7      7 83.5  113142
##  8      8 72.6    3528
##  9      9  1.38    366
## 10     10 80.1   48507
## 11     11 85.2   36303
## 12     12 82.7   78369
## 13     13  2.29    864
## 14     14  1.88    919
## 15     15 83.3   85681
## 16     16 54.1     417
## 17     17 83.9   82386
## 18     18 81.6  127130
## 19     19 21.0    1281
## 20     20 81.1  132739

What is mean % methylated with 15x coverage?

meth_unite_reshaped %>% 
  filter(coverage>=15) %>% 
  group_by(sample) %>%
  summarize(mean = mean(percMeth), n = n())
## # A tibble: 20 x 3
##    sample  mean     n
##     <dbl> <dbl> <int>
##  1      1 85.2  39878
##  2      2 83.0   5656
##  3      3 80.6  30239
##  4      4 81.8  19507
##  5      5  2.64   910
##  6      6 29.8    214
##  7      7 84.3  68228
##  8      8 54.9    336
##  9      9  1.03   154
## 10     10 80.4  11926
## 11     11 85.5   9520
## 12     12 83.5  41526
## 13     13  1.96   379
## 14     14  1.49   419
## 15     15 84.1  27937
## 16     16 36.3     74
## 17     17 85.3  52617
## 18     18 82.3  66801
## 19     19 14.0    465
## 20     20 82.0  52841

Out of interest, do cluster analysis

clusterSamples(meth_unite, dist="correlation", method="ward", plot=TRUE)
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"

## 
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
## 
## Cluster method   : ward.D 
## Distance         : pearson 
## Number of objects: 20
PCASamples(meth_unite)

Percent methylation matrix (rows=region/base, columns=sample) can be extracted from methylBase object by using percMethylation function. This can be useful for downstream analyses.

Here I create % methylation matrices from the filtered object, and use it to do another cluster analysis

perc.meth=percMethylation(meth_unite, rowids=T)
hist(perc.meth, col="gray50" )

# Save % methylation df to object and .tab file 
#save(perc.meth, file = "../analyses/methylation-filtered/R-objects/perc.meth") #save object to file 
#write.table((as.data.frame(perc.meth) %>% tibble::rownames_to_column("contig")), file = here::here("analyses", "methylation-filtered", "percent-methylation-filtered.tab"), sep = '\t', na = "NA", row.names = FALSE, col.names = TRUE)

# What percentage of loci have ZERO methylation? 
100*table(perc.meth==0)[2]/table(perc.meth==0)[1] # 20% of loci unmethylated, averaged across all samples
##     TRUE 
## 23.83316
# Mean % methylation across all samples 
mean(perc.meth, na.rm=TRUE) # 62%
## [1] 70.50057